Geographical Distribution of Restaurants
# Read the Toronto restaurants data set
restaurants_in_Toronto <- read_csv("~/EDAV_project/restaurants in Toronto.csv")
# Read the list of cuisine categories from the reference text file
cuisine_cat <- read.delim("Cat_List_cuisines_only.txt", header = TRUE)
# Keep only restaurants whose category is a cuisine in the reference list
restaurants <- restaurants_in_Toronto %>%
  filter(category %in% cuisine_cat$Category)
# Identify the 20 most common cuisines (ties included by top_n)
cuisine_counts <- restaurants %>%
  group_by(category) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  top_n(20, count)
mostcommon20_cuisines <- unique(cuisine_counts$category)
# Restrict to the 20 most common cuisines for mapping and graphing
cuisines_toronto <- restaurants %>%
  filter(category %in% mostcommon20_cuisines)
# Confirm there are no missing values in id, category, latitude, and longitude
skim(cuisines_toronto)
## Skim summary statistics
## n obs: 4290
## n variables: 13
##
## Variable type: character
## variable missing complete n min max empty n_unique
## address 12 4278 4290 8 64 0 3031
## category 0 4290 4290 4 22 0 20
## city 0 4290 4290 7 7 0 2
## id 0 4290 4290 22 22 0 3213
## name 0 4290 4290 2 46 0 2721
## neighborhood 835 3455 4290 7 29 0 71
## postal_code 9 4281 4290 3 7 0 1822
## state 0 4290 4290 2 2 0 1
##
## Variable type: integer
## variable missing complete n mean sd p0 p25 p50 p75 p100
## is_open 0 4290 4290 1 0 1 1 1 1 1
## review_count 0 4290 4290 59.81 94.51 3 10 29 69 1494
## hist
## ▁▁▁▇▁▁▁▁
## ▇▁▁▁▁▁▁▁
##
## Variable type: numeric
## variable missing complete n mean sd p0 p25 p50 p75
## latitude 0 4290 4290 43.69 0.049 43.59 43.65 43.67 43.71
## longitude 0 4290 4290 -79.39 0.066 -79.7 -79.42 -79.39 -79.37
## stars 0 4290 4290 3.45 0.71 1 3 3.5 4
## p100 hist
## 43.84 ▁▅▇▂▁▁▁▁
## -79.14 ▁▁▁▂▇▂▁▁
## 5 ▁▁▂▆▇▇▂▁
# analysis of NA values in neighborhood column by cuisine
# (skim the full filtered restaurants table first to see the overall count of
# missing neighborhood values before breaking them down per cuisine below)
skim(restaurants)
## Skim summary statistics
## n obs: 5389
## n variables: 13
##
## Variable type: character
## variable missing complete n min max empty n_unique
## address 13 5376 5389 8 64 0 3425
## category 0 5389 5389 4 22 0 94
## city 0 5389 5389 7 7 0 2
## id 0 5389 5389 22 22 0 3637
## name 0 5389 5389 2 46 0 3105
## neighborhood 1074 4315 5389 7 29 0 71
## postal_code 12 5377 5389 3 7 0 1998
## state 0 5389 5389 2 2 0 1
##
## Variable type: integer
## variable missing complete n mean sd p0 p25 p50 p75 p100
## is_open 0 5389 5389 1 0 1 1 1 1 1
## review_count 0 5389 5389 61.33 96.47 3 10 30 71 1494
## hist
## ▁▁▁▇▁▁▁▁
## ▇▁▁▁▁▁▁▁
##
## Variable type: numeric
## variable missing complete n mean sd p0 p25 p50 p75
## latitude 0 5389 5389 43.69 0.05 43.59 43.65 43.67 43.71
## longitude 0 5389 5389 -79.39 0.067 -79.7 -79.42 -79.39 -79.37
## stars 0 5389 5389 3.48 0.7 1 3 3.5 4
## p100 hist
## 43.88 ▁▇▅▂▂▂▁▁
## -79.14 ▁▁▁▂▇▂▁▁
## 5 ▁▁▂▅▇▇▂▁
# Count missing neighborhood values per cuisine
cuisines_toronto_na <- cuisines_toronto %>%
  group_by(category) %>%
  summarise(num_nas = sum(is.na(neighborhood)))
# Attach the total restaurant count per cuisine.
# dplyr::left_join has the same left-join semantics as plyr::join here and
# keeps the whole pipeline within dplyr instead of mixing in plyr.
cuisines_toronto_na <- dplyr::left_join(cuisines_toronto_na, cuisine_counts, by = "category")
# Share of each cuisine's restaurants with a missing neighborhood
cuisines_toronto_perc <- cuisines_toronto_na %>% mutate(perc_nas = num_nas / count)
ggplot(cuisines_toronto_perc, aes(x = category, y = perc_nas)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

ggplot(cuisine_counts, aes(x = category, y = count)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Distribution of all restaurants in Toronto (removing duplicates)
# Drop column 13 by position (TODO confirm which column this is — presumably
# one that varies across otherwise-duplicate rows) and remove duplicate rows
cuisines_toronto_dedup <- cuisines_toronto[-13] %>% distinct()
qmplot(longitude, latitude, data = cuisines_toronto_dedup, maptype = "toner-lite", color = I("red"), size=I(0.5), alpha=I(.5)) + ggtitle("Restaurants in Toronto")

# mapping neighborhood NA values
# (recomputes the same de-duplicated table as above)
cuisines_toronto_dedup <- cuisines_toronto[-13] %>% distinct()
# NOTE(review): ifelse() strips the AsIs class from I("NA")/I("Not NA"), so the
# result is a plain character vector mapped as an aesthetic (producing the
# legend styled below) — confirm this indirection is intended
qmplot(longitude, latitude, data = cuisines_toronto_dedup, maptype = "toner-lite", color = ifelse(is.na(neighborhood), I("NA"), I("Not NA")), size=I(0.5), alpha=I(.5)) + ggtitle("Restaurants in Toronto") + scale_color_colorblind() + theme(legend.title=element_blank())

# + scale_color_viridis(discrete=TRUE)
# Map every restaurant, faceted into one panel per common cuisine
qmplot(longitude, latitude, data = cuisines_toronto, maptype = "toner-lite", color=I("red"), size=I(0.05), alpha=I(0.5)) + ggtitle("Restaurants in Toronto by Cuisine") + facet_wrap(~category)

Distribution of restaurants in Toronto by rating
# Bucket star ratings into four labelled categories:
# (0,2] Very Bad, (2,3] Bad, (3,4] Good, (4,5] Very Good
cuisines_toronto_rating_dedup <- cuisines_toronto_dedup %>%
  mutate(rating = cut(stars,
                      breaks = c(0, 2, 3, 4, 5),
                      labels = c("Very Bad", "Bad", "Good", "Very Good")))
qmplot(longitude, latitude,
       data = cuisines_toronto_rating_dedup,
       maptype = "toner-lite",
       color = I("red"), size = I(0.5), alpha = I(.5)) +
  ggtitle("Restaurants in Toronto by Rating") +
  facet_wrap(~rating)

Distribution of restaurants with “Good” and “Very Good” rating by cuisine
# Same rating buckets as above, applied to the non-deduplicated table
cuisines_toronto_rating <- cuisines_toronto %>%
  mutate(rating = cut(stars,
                      breaks = c(0, 2, 3, 4, 5),
                      labels = c("Very Bad", "Bad", "Good", "Very Good")))
# Keep only well-rated restaurants
cuisines_toronto_good <- cuisines_toronto_rating %>%
  filter(rating %in% c("Good", "Very Good"))
qmplot(longitude, latitude,
       data = cuisines_toronto_good,
       maptype = "toner-lite",
       color = I("red"), size = I(0.05), alpha = I(0.5)) +
  ggtitle('"Good" and "Very Good" Restaurants in Toronto by Cuisine') +
  facet_wrap(~category)

Distribution of restaurants in Toronto by Number of Check-ins
# reading in check-in data
check_in <- read_csv("~/EDAV_project/Data/restaurants checkins Toronto.csv")
# Total check-ins per business.
# group_by(business_id) replaces the group_by(check_in$business_id)
# anti-pattern, which grouped by an external vector expression and produced
# an awkward column name that then had to be fixed positionally;
# rename() gives the key the name the joins below expect.
check_in <- check_in %>%
  group_by(business_id) %>%
  summarise(n_checkins = sum(count, na.rm = TRUE)) %>%
  rename(id = business_id)
cuisines_toronto_check_ins_dedup <- plyr::join(cuisines_toronto_rating_dedup, check_in, by = "id")
# checking for missing values in the joined check-in counts
skim(cuisines_toronto_check_ins_dedup$n_checkins)
## Skim summary statistics
##
## Variable type: integer
## variable missing complete n mean
## cuisines_toronto_check_ins_dedup$n_checkins 471 2742 3213 128.41
## sd p0 p25 p50 p75 p100 hist
## 258.14 1 13.25 44 137 5755 ▇▁▁▁▁▁▁▁
# Drop the restaurants that have no check-in data at all
cuisines_toronto_check_ins_dedup <-
  subset(cuisines_toronto_check_ins_dedup, !is.na(n_checkins))
skim(cuisines_toronto_check_ins_dedup$n_checkins)
## Skim summary statistics
##
## Variable type: integer
## variable missing complete n mean
## cuisines_toronto_check_ins_dedup$n_checkins 0 2742 2742 128.41
## sd p0 p25 p50 p75 p100 hist
## 258.14 1 13.25 44 137 5755 ▇▁▁▁▁▁▁▁
# using summary provided by skim to choose cutoff points for checkin categories
# (skim repeated here so the quantiles are visible next to the cut() below;
# the 5/30/110 breakpoints presumably derive from the p25/p50/p75 values
# shown — confirm with the authors)
skim(cuisines_toronto_check_ins_dedup$n_checkins)
## Skim summary statistics
##
## Variable type: integer
## variable missing complete n mean
## cuisines_toronto_check_ins_dedup$n_checkins 0 2742 2742 128.41
## sd p0 p25 p50 p75 p100 hist
## 258.14 1 13.25 44 137 5755 ▇▁▁▁▁▁▁▁
# Bucket check-in totals into four labelled categories and map them
cuisines_toronto_check_ins_dedup <- cuisines_toronto_check_ins_dedup %>%
  mutate(checkin_cat = cut(n_checkins,
                           breaks = c(-Inf, 5, 30, 110, Inf),
                           labels = c("Very Low", "Low", "High", "Very High")))
qmplot(longitude, latitude,
       data = cuisines_toronto_check_ins_dedup,
       maptype = "toner-lite",
       color = I("red"), size = I(0.5), alpha = I(.5)) +
  ggtitle("Restaurants in Toronto by Number of Check-ins") +
  facet_wrap(~checkin_cat)

Distribution of restaurants with “High” and “Very High” number of check-ins by cuisine
# joining checkin data with cuisines_toronto_rating
# (left join on restaurant id; rows without check-in data get NA n_checkins)
cuisines_toronto_check_ins <- plyr::join(cuisines_toronto_rating, check_in, by = "id")
# checking for missing values in the joined check-in counts
skim(cuisines_toronto_check_ins$n_checkins)
## Skim summary statistics
##
## Variable type: integer
## variable missing complete n mean sd
## cuisines_toronto_check_ins$n_checkins 498 3792 4290 155.11 339.98
## p0 p25 p50 p75 p100 hist
## 1 14 50 159 5755 ▇▁▁▁▁▁▁▁
# Drop rows with no check-in data, bucket the totals, and map the busy ones
cuisines_toronto_check_ins <-
  subset(cuisines_toronto_check_ins, !is.na(n_checkins))
cuisines_toronto_check_ins <- cuisines_toronto_check_ins %>%
  mutate(checkin_cat = cut(n_checkins,
                           breaks = c(-Inf, 5, 30, 110, Inf),
                           labels = c("Very Low", "Low", "High", "Very High")))
cuisines_toronto_high <- cuisines_toronto_check_ins %>%
  filter(checkin_cat %in% c("High", "Very High"))
qmplot(longitude, latitude,
       data = cuisines_toronto_high,
       maptype = "toner-lite",
       color = I("red"), size = I(0.05), alpha = I(0.5)) +
  ggtitle('Restaurants in Toronto with "High" and "Very High" Number of Check-ins by Cuisine') +
  facet_wrap(~category)

Distribution of cuisines in Toronto by neighborhood
# remove na values in neighborhood column and graph number of restaurants by
# cuisine in each neighborhood with >= 70 restaurants
# (the original also filtered neigh_pop >= 10 first; that filter is subsumed
# by the >= 70 filter below, so it is removed as dead code)
pop_neighborhood <- cuisines_toronto[!is.na(cuisines_toronto$neighborhood), ] %>%
  group_by(neighborhood) %>%
  mutate(neigh_pop = n()) %>%
  ungroup() %>%
  filter(neigh_pop >= 70) %>%
  group_by(neighborhood, category) %>%
  summarize(count = n())
ggplot(pop_neighborhood, aes(x = category, y = count)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~neighborhood) +
  ggtitle("Distribution of Cuisines by Neighborhood")

Sentiment, rating, and check-ins
Relationship between sentiment, rating, and check-ins
# add sentiment and check-in data to restaurants dataset
# adding in check-in data
restaurants_full <- plyr::join(restaurants, check_in, by = "id")
# reading in sentiment data
sentiment_toronto <- read_csv("~/EDAV_project/Data/sentiment_reviews_Toronto.csv")
# rename column name with sentiment value
# (column 11 by position — TODO confirm this index against the CSV schema)
colnames(sentiment_toronto)[11] <- "sentiment"
# get average sentiment for each restaurant
mean_sentiment <- sentiment_toronto %>% group_by(business_id) %>% summarise(avg_sentiment = mean(sentiment))
# add sentiment information to restaurants dataset
colnames(mean_sentiment)[1] <- "id"
restaurants_full <- plyr::join(restaurants_full, mean_sentiment, by = "id")
# drop column 13 by position before de-duplicating (same positional drop used
# earlier in the file — confirm which column this is)
restaurants_full_dedup <- restaurants_full[-13] %>% distinct()
# remove outlier in n_checkins
restaurants_full_dedup <- restaurants_full_dedup %>% subset(n_checkins<2500)
# remove na values in n_checkins and avg_sentiment
restaurants_full_dedup <- restaurants_full_dedup[!is.na(restaurants_full_dedup$n_checkins),]
restaurants_full_dedup <- restaurants_full_dedup[!is.na(restaurants_full_dedup$avg_sentiment),]
# select stars, check-ins, and sentiment by position (columns 10, 13, 14 —
# presumably stars/n_checkins/avg_sentiment given the renames below; verify)
parallel_data <- restaurants_full_dedup[,c(10,13,14)]
colnames(parallel_data)[1] <- "Average Stars"
colnames(parallel_data)[2] <- "Number of Check-ins"
colnames(parallel_data)[3] <- "Average Sentiment"
# parallel coordinate plot of rating, sentiment, and check-ins
parcoords(parallel_data
, rownames = F
, brushMode = "1d-axes"
, reorderable = T
, color = list(
colorBy = "Average Stars"
, colorScale = htmlwidgets::JS("d3.scale.category10()")))
# ggparcoord(restaurants_full_dedup, columns = c(10,14,13), alphaLines = .1, scale = "uniminmax")
Star distribution for top 20 cuisine types
# read in restaurants in Toronto file as dataframe
restaurants <- read_csv("~/EDAV_project/restaurants in Toronto.csv")
# read in sentiment information
toronto <- read_csv("~/EDAV_project/Data/sentiment_reviews_Toronto.csv")
# read in categories from text file
relevant_cats <- read.delim("Cat_List_cuisines_only.txt", header = TRUE)
# subset restaurants dataset by categories in text file
restaurants <- restaurants[restaurants$category %in% relevant_cats$Category, ]
# obtain mapping of business ids and categories (columns 1 and 13 by position)
categories <- restaurants[c(1, 13)]
# remove user id from toronto sentiment dataset and rename "business id" column as "id"
data <- toronto[-1]
colnames(data)[1] <- "id"
# join toronto sentiment dataset (without user id) with restaurants dataset
# which has rating and other info
revs_cat <- plyr::join(data, restaurants, by = "id")
# remove rows with missing values in category column
revs_cat <- subset(revs_cat, !is.na(revs_cat$category))
# refactor category and stars column
revs_cat$category <- as.factor(revs_cat$category)
revs_cat$stars <- as.factor(revs_cat$stars)
# Review counts per cuisine.
# group_by(category) replaces the group_by(revs_cat$category) anti-pattern so
# the result carries a clean "category" column name (downstream code accessed
# the old name only positionally, so this stays compatible).
most_freq <- revs_cat %>% group_by(category) %>% summarise(cnt = n())
top_20 <- head(most_freq[order(-most_freq$cnt), ], 20)[1]
# NOTE: the variable name "subset" shadows base::subset(); kept because later
# sections of the file reference this variable.
subset <- revs_cat[revs_cat$category %in% top_20$category, ]
# counts by category and star level, with the within-category percentage.
# (The original first computed an identical table over the full revs_cat and
# immediately overwrote it — that dead computation is removed.)
temp <- subset %>%
  group_by(category, stars) %>%
  summarise(cnt = n()) %>%
  mutate(perc = round(cnt / sum(cnt), 4))
# plotting top 20 cuisine types
ggplot(data = temp, aes(stars, perc)) +
  geom_col(fill = "darkblue", colour = "black") +
  labs(y = "Cuisine Type", x = "Stars") +
  coord_flip() +
  facet_wrap(~category, scales = "free") +
  ggtitle("Stars by Cuisine Type")

Review Sentiment of top 20 cuisines
# Boxplots of review sentiment per cuisine; text_1 is the per-review sentiment
# score (presumably in [-1, 1] — confirm with the sentiment pipeline)
ggplot(subset, aes(category, text_1)) + geom_boxplot() + coord_flip() + labs(y = "Review Sentiment", x = "Cuisine Type")

Review sentiment of Top 5, Bottom 5, Middle 5
# Pick the five most-reviewed, five least-reviewed, and five middle cuisines.
# Note: middle_5 is taken from the unsorted most_freq, matching the original.
ranked <- most_freq[order(-most_freq$cnt), ]
top_5 <- head(ranked, 5)[1]
top_5$indicator <- "Top_5"
bottom_5 <- tail(ranked, 5)[1]
bottom_5$indicator <- "Bottom_5"
mid_start <- round(nrow(most_freq) / 2, 0)
middle_5 <- most_freq[mid_start:(mid_start + 4), ][1]
middle_5$indicator <- "Middle_5"
sample <- rbind(bottom_5, top_5, middle_5)
colnames(sample)[1] <- "category"
subset <- subset(revs_cat, revs_cat$category %in% sample$category)
subset <- plyr::join(subset, sample, by = "category")
ggplot(subset, aes(category, text_1)) +
  geom_boxplot() +
  coord_flip() +
  labs(y = "Review Sentiment", x = "Cuisine Type") +
  facet_wrap(~indicator, scales = "free_y")

Find cuisines with highest and lowest average ratings
# Convert the stars factor back to its numeric values.
# BUG FIX: as.numeric() on a factor returns the level *codes*, not the level
# labels, which silently corrupts any non-contiguous star values;
# as.character() first recovers the actual values.
revs_cat$stars <- as.numeric(as.character(revs_cat$stars))
# order reviews chronologically, then smooth with a trailing 7-observation
# rolling mean (column 3 is selected by position — TODO confirm this is the
# review-star column in the joined frame)
revs_cat <- revs_cat[order(as.Date(revs_cat$date, format = "%Y-%m-%d")), ]
revs_cat$ma_rating <- rollmeanr(revs_cat[, 3], 7, fill = NA)
# backfill the first rolling-window NAs with the overall mean
revs_cat$ma_rating[is.na(revs_cat$ma_rating)] <- mean(revs_cat$ma_rating, na.rm = TRUE)
# average smoothed rating per cuisine, highest first
rating_by_cuisine <- revs_cat %>%
  group_by(category) %>%
  summarise(avg_rating = mean(ma_rating, na.rm = TRUE)) %>%
  arrange(desc(avg_rating))
top_10 <- head(rating_by_cuisine, 10)[1]
top_10$indicator <- "Top_10"
bottom_10 <- tail(rating_by_cuisine, 10)[1]
bottom_10$indicator <- "Bottom_10"
sample <- rbind(bottom_10, top_10)
colnames(sample)[1] <- "category"
subset <- subset(revs_cat, revs_cat$category %in% sample$category)
subset <- plyr::join(subset, sample, by = "category")
ggplot(subset, aes(category, ma_rating)) +
  geom_boxplot() +
  coord_flip() +
  labs(y = "Average Review", x = "Cuisine Type") +
  facet_wrap(~indicator, scales = "free_y")

Case study:
# read in dataset of all reviews for Toronto businesses; text_1 column is the
# sentiment value (calculated using a python library)
toronto <- read_csv("~/EDAV_project/Data/sentiment_reviews_Toronto.csv")
# number of reviews for each business id
most_rev <- data.frame(table(toronto$business_id))
# arrange business ids in decreasing order of review count; Var1 (the id)
# comes back from table() as a factor, so convert it to character
most_rev <- most_rev[order(-most_rev$Freq), ]
most_rev$Var1 <- as.character(most_rev$Var1)
# business id with the greatest number of reviews
var <- most_rev$Var1[1]
# all reviews for that business, in chronological order.
# Parse the date column ONCE instead of re-running as.Date() three times as
# the original did; the resulting frame is identical.
reviews <- subset(toronto, business_id == var)
reviews$date <- as.Date(reviews$date, format = "%Y-%m-%d")
reviews <- reviews[order(reviews$date), ]
# add year and month columns derived from the parsed date
reviews$year <- as.numeric(format(reviews$date, "%Y"))
reviews$month <- as.numeric(format(reviews$date, "%m"))
# convert elite column to a 0/1 factor
# NOTE(review): levels(x) <- c(0, 1) assumes exactly two levels in a fixed
# order — confirm the raw elite column really is binary
reviews$elite <- as.factor(reviews$elite)
levels(reviews$elite) <- c(0, 1)
# trim columns of review data frame
reviews <- reviews[c("business_id", "date", "stars", "elite", "text_1", "year", "month")]
a) correlation between rating and sentiment
# Mean sentiment at each star level for the case-study restaurant
avg_sent <- reviews %>%
  group_by(stars) %>%
  summarize(mean = mean(text_1, na.rm = TRUE))
ggplot(avg_sent, aes(stars, mean)) +
  geom_col(color = "darkblue", fill = "darkblue") +
  xlab("Stars") +
  ylab("Average Review Sentiment")

b) review sentiment of elite users
# Mean review sentiment for elite vs non-elite users across all Toronto reviews
elite_sent <- toronto %>%
  group_by(elite) %>%
  summarize(mean = mean(text_1, na.rm = TRUE))
ggplot(elite_sent, aes(elite, mean)) +
  geom_col(color = "darkblue", fill = "darkblue", width = 0.5) +
  xlab("Elite") +
  ylab("Average Review Sentiment")

c) word clouds for good reviews (sentiment >= 0.5) and bad reviews (sentiment <= -0.5)
# Word-frequency tables for good and bad reviews.
# Read with explicit paths instead of setwd(): changing the working directory
# is a global side effect the rest of the script should not depend on.
good_revs <- read_csv("~/EDAV_project/Data/good_revs.csv")
bad_revs <- read_csv("~/EDAV_project/Data/bad_revs.csv")
# fixed seed so the word-cloud layout is reproducible
set.seed(1234)
wordcloud(words = good_revs$word, freq = good_revs$freq, min.freq = 300,
          max.words = 100, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))

wordcloud(words = bad_revs$word, freq = bad_revs$freq, min.freq = 5000,
          max.words = 100, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))

d) sentiment progression over time
# Stationary Sentiment Progression over Time
# (grey line = sentiment of every review over time; blue diamonds overlay the
# reviews written by elite users. The ggtitle string below contains an
# embedded newline carried over from the source formatting — it renders as a
# two-line title and must not be re-joined.)
elite <- subset(reviews, reviews$elite == 1)
ggplot(reviews, aes(date, text_1)) + geom_line(color='grey') + scale_x_date(date_labels = ("%b-%Y"), limits =
as.Date(c(reviews$date[1], reviews$date[nrow(reviews)]))) + xlab("Date") + ylab("Sentiment") + ylim(-1,1) +
geom_point(data = elite, aes(date, text_1), color='darkblue', shape = 5, size = 0.5) + ggtitle("Sentiment over
Time")

# Restrict to reviews after June 1, 2017 for a zoomed-in view
reviews_2017 <- subset(reviews, reviews$date > as.Date("2017-06-01"))
# Stationary Graph for Sentiment Progression June - December 2017
elite <- subset(reviews_2017, reviews_2017$elite == 1)
ggplot(reviews_2017, aes(date, text_1)) + geom_line(color = 'grey') + scale_x_date(date_labels = ("%b-%Y"),
limits = as.Date(c(reviews_2017$date[1], reviews_2017$date[nrow(reviews_2017)]))) + xlab("Date") +
ylab("Sentiment") + ylim(-1,1) + geom_point(data = elite, aes(date, text_1), color='darkblue', shape = 5, size
= 0.5) + ggtitle("Sentiment Progression from June to December 2017")
